Libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(skimr)
library(png)
library(grid)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Load data

load_csv <- function(csv_name, data_folder = 'data') {
  df <- read.csv(file.path(data_folder, csv_name))
  as_tibble(df)
}

inventories_df <- load_csv('inventories.csv')
inventory_sets_df <- load_csv('inventory_sets.csv')
sets_df <- load_csv('sets.csv')
themes_df <- load_csv('themes.csv')
inventory_minifigs_df <- load_csv('inventory_minifigs.csv')
minifigs_df <- load_csv('minifigs.csv')
inventory_parts_df <- load_csv('inventory_parts.csv')
colors_df <- load_csv('colors.csv')
parts_df <- load_csv('parts.csv')
elements_df <- load_csv('elements.csv')
part_categories_df <- load_csv('part_categories.csv')
part_relationships_df <- load_csv('part_relationships.csv')

Basic tables analysis

Data schema

Tables

Inventories

Data summary
id version set_num
1 1 7922-1
3 1 3931-1
4 1 6942-1
15 1 5158-1
16 1 903-1
17 1 850950-1
Name inventories_df
Number of rows 37265
Number of columns 3
_______________________
Column type frequency:
character 1
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
set_num 0 1 3 20 0 35644 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 61103.60 51380.10 1 14424 54379 88842 194312 ▇▆▂▂▂
version 0 1 1.09 0.58 1 1 1 1 16 ▇▁▁▁▁

Inventory sets

Data summary
inventory_id set_num quantity
35 75911-1 1
35 75912-1 1
39 75048-1 1
39 75053-1 1
50 4515-1 1
50 4520-1 2
Name inventory_sets_df
Number of rows 4358
Number of columns 3
_______________________
Column type frequency:
character 1
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
set_num 0 1 5 20 0 3171 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
inventory_id 0 1 52518.95 59063.13 35 8076 16423 98685 191576 ▇▁▁▂▁
quantity 0 1 1.81 5.67 1 1 1 1 60 ▇▁▁▁▁

Sets

Data summary
set_num name year theme_id num_parts img_url
001-1 Gears 1965 1 43 https://cdn.rebrickable.com/media/sets/001-1.jpg
0011-2 Town Mini-Figures 1979 67 12 https://cdn.rebrickable.com/media/sets/0011-2.jpg
0011-3 Castle 2 for 1 Bonus Offer 1987 199 0 https://cdn.rebrickable.com/media/sets/0011-3.jpg
0012-1 Space Mini-Figures 1979 143 12 https://cdn.rebrickable.com/media/sets/0012-1.jpg
0013-1 Space Mini-Figures 1979 143 12 https://cdn.rebrickable.com/media/sets/0013-1.jpg
0014-1 Space Mini-Figures 1979 143 2 https://cdn.rebrickable.com/media/sets/0014-1.jpg
Name sets_df
Number of rows 21880
Number of columns 6
_______________________
Column type frequency:
character 3
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
set_num 0 1 3 20 0 21880 0
name 0 1 2 93 0 18752 0
img_url 0 1 46 63 0 21880 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2007.76 13.96 1949 2001 2012 2018 2024 ▁▁▁▃▇
theme_id 0 1 441.97 215.53 1 273 497 608 752 ▃▃▃▇▇
num_parts 0 1 161.38 418.14 0 3 31 139 11695 ▇▁▁▁▁

Themes

Data summary
id name parent_id
1 Technic NA
3 Competition 1
4 Expert Builder 1
16 RoboRiders 1
17 Speed Slammers 1
18 Star Wars 1
Name themes_df
Number of rows 468
Number of columns 3
_______________________
Column type frequency:
character 1
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1 2 42 0 385 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 433.46 216.55 1 250.5 466 625.25 752 ▅▅▅▆▇
parent_id 145 0.69 360.64 197.19 1 186.0 411 512.50 697 ▅▃▂▇▂

Inventory Minifigs

Data summary
inventory_id fig_num quantity
3 fig-001549 1
4 fig-000764 1
19 fig-000555 1
25 fig-000574 1
26 fig-000842 1
26 fig-008641 1
Name inventory_minifigs_df
Number of rows 20858
Number of columns 3
_______________________
Column type frequency:
character 1
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
fig_num 0 1 10 10 0 13455 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
inventory_id 0 1 43010.44 52256.78 3 7869 15681 66834 194312 ▇▁▁▁▁
quantity 0 1 1.06 0.78 1 1 1 1 100 ▇▁▁▁▁

Minifigs

Data summary
fig_num name num_parts img_url
fig-000001 Toy Store Employee 4 https://cdn.rebrickable.com/media/sets/fig-000001.jpg
fig-000002 Customer Kid 4 https://cdn.rebrickable.com/media/sets/fig-000002.jpg
fig-000003 Assassin Droid, White 8 https://cdn.rebrickable.com/media/sets/fig-000003.jpg
fig-000004 Man, White Torso, Black Legs, Brown Hair 4 https://cdn.rebrickable.com/media/sets/fig-000004.jpg
fig-000005 Captain America with Short Legs 3 https://cdn.rebrickable.com/media/sets/fig-000005.jpg
fig-000006 Lloyd Avatar 5 https://cdn.rebrickable.com/media/sets/fig-000006.jpg
Name minifigs_df
Number of rows 13764
Number of columns 4
_______________________
Column type frequency:
character 3
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
fig_num 0 1 10 10 0 13764 0
name 0 1 1 148 0 13354 0
img_url 0 1 53 53 0 13764 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
num_parts 0 1 5.3 6.03 0 4 4 5 156 ▇▁▁▁▁

Inventory parts

Data summary
inventory_id part_num color_id quantity is_spare img_url
1 48379c01 72 1 f https://cdn.rebrickable.com/media/parts/photos/1/48379c01-1-e7daa845-2671-4737-8642-3b1574308155.jpg
1 48395 7 1 f https://cdn.rebrickable.com/media/parts/photos/7/48395-7-b9152acf-2fa5-4836-a04d-5b7fd39c2406.jpg
1 stickerupn0077 9999 1 f
1 upn0342 0 1 f
1 upn0350 25 1 f
3 2343 47 1 f https://cdn.rebrickable.com/media/parts/elements/3000240.jpg
Name inventory_parts_df
Number of rows 1180987
Number of columns 6
_______________________
Column type frequency:
character 3
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
part_num 0 1 1 20 0 51051 0
is_spare 0 1 1 1 0 2 0
img_url 0 1 0 117 8180 74266 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
inventory_id 0 1 50849.46 55136.94 1 9404 22838 87088 194312 ▇▂▁▂▁
color_id 0 1 131.78 862.38 -1 4 15 71 9999 ▇▁▁▁▁
quantity 0 1 3.37 9.95 1 1 2 4 3064 ▇▁▁▁▁

Colors

Data summary
id name rgb is_trans
-1 [Unknown] 0033B2 f
0 Black 05131D f
1 Blue 0055BF f
2 Green 237841 f
3 Dark Turquoise 008F9B f
4 Red C91A09 f
Name colors_df
Number of rows 263
Number of columns 4
_______________________
Column type frequency:
character 3
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1 3 28 0 263 0
rgb 0 1 6 6 0 223 0
is_trans 0 1 1 1 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 651.38 750.55 -1 83 1005 1070.5 9999 ▇▁▁▁▁

Parts

Data summary
part_num name part_cat_id part_material
003381 Sticker Sheet for Set 663-1 58 Plastic
003383 Sticker Sheet for Sets 618-1, 628-2 58 Plastic
003402 Sticker Sheet for Sets 310-3, 311-1, 312-3 58 Plastic
003429 Sticker Sheet for Set 1550-1 58 Plastic
003432 Sticker Sheet for Sets 357-1, 355-1, 940-1 58 Plastic
003434 Sticker Sheet for Set 575-2, 653-1, 460-1 58 Plastic
Name parts_df
Number of rows 52615
Number of columns 4
_______________________
Column type frequency:
character 3
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
part_num 0 1 1 20 0 52615 0
name 0 1 3 222 0 52103 0
part_material 0 1 4 16 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
part_cat_id 0 1 38.91 22.08 1 17 41 60 68 ▃▃▂▁▇

Elements

Data summary
element_id part_num color_id design_id
6443403 2277c01pr0009 1 2277
6300211 67906c01 14 67908
4566309 2564 0 2564
4275423 53657 1004 53657
6194308 92926 71 28967
6229123 26561 4 26561
Name elements_df
Number of rows 84138
Number of columns 4
_______________________
Column type frequency:
character 1
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
part_num 0 1 2 19 0 33765 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
element_id 0 1.00 5222065.12 1596842.63 9327 4259773.50 6057754 6262024.5 61532443 ▇▁▁▁▁
color_id 0 1.00 539.67 2044.86 -1 8.00 28 135.0 9999 ▇▁▁▁▁
design_id 23682 0.72 45569.87 30750.66 1001 18453.75 41748 75474.5 107520 ▇▆▅▅▃

Part Relationships

Data summary
rel_type child_part_num parent_part_num
P 3626cpr3662 3626c
P 87079pr9974 87079
P 3960pr9971 3960
R 98653pr0003 98086pr0003
R 98653pr0003 98088pat0003
R 98653pr0003 98089pat0003
Name part_relationships_df
Number of rows 29977
Number of columns 3
_______________________
Column type frequency:
character 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
rel_type 0 1 1 1 0 6 0
child_part_num 0 1 1 20 0 27139 0
parent_part_num 0 1 1 19 0 4725 0

Part categories

Data summary
id name
1 Baseplates
3 Bricks Sloped
4 Duplo, Quatro and Primo
5 Bricks Special
6 Bricks Wedged
7 Containers
Name part_categories_df
Number of rows 66
Number of columns 2
_______________________
Column type frequency:
character 1
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1 4 44 0 66 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 35.36 19.41 1 19.25 35.5 51.75 68 ▇▇▇▇▇

Dataframes size

All rows in dataframes:  1446639 
All columns in dataframes:  45 
All values in dataframes:  8099232 

Analysis deep dive

Themes

Themes sum of rows with NA value

## [1] 145

All NA values are in parent_id column

## # A tibble: 1 × 3
##      id  name parent_id
##   <int> <int>     <int>
## 1     0     0       145

Create themes_parent_df

## # A tibble: 145 × 2
##       id name             
##    <int> <chr>            
##  1     1 Technic          
##  2    22 Creator          
##  3    34 Make & Create    
##  4    50 Town             
##  5    52 City             
##  6   112 Racers           
##  7   126 Space            
##  8   147 Pirates          
##  9   155 Modular Buildings
## 10   158 Star Wars        
## # ℹ 135 more rows

Join themes_parent_df with themes_df, insert name_child into name_parent if name_parent is NA

## # A tibble: 468 × 3
##       id name_child             name_parent
##    <int> <chr>                  <chr>      
##  1     1 Technic                Technic    
##  2     3 Competition            Technic    
##  3     4 Expert Builder         Technic    
##  4    16 RoboRiders             Technic    
##  5    17 Speed Slammers         Technic    
##  6    18 Star Wars              Technic    
##  7    19 Supplemental           Technic    
##  8    20 Throwbot Slizer        Technic    
##  9    21 Universal Building Set Technic    
## 10    22 Creator                Creator    
## # ℹ 458 more rows

Sets

Sets sum of rows with NA value

## [1] 0

Correlation between year and num_parts

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Join sets with prepared themes

## # A tibble: 21,880 × 4
##     year num_parts name_child     name_parent
##    <int>     <int> <chr>          <chr>      
##  1  1965        43 Technic        Technic    
##  2  1979        12 Classic Town   Town       
##  3  1987         0 Lion Knights   Castle     
##  4  1979        12 Supplemental   Space      
##  5  1979        12 Supplemental   Space      
##  6  1979         2 Supplemental   Space      
##  7  1979        18 Supplemental   Space      
##  8  1979        15 Classic Castle Castle     
##  9  1965         3 Technic        Technic    
## 10  2013         4 Books          Books      
## # ℹ 21,870 more rows

156 Parent themes with number of sets and sum of parts

## # A tibble: 156 × 3
##    name_parent     count_sets sum_parts
##    <chr>                <int>     <int>
##  1 4 Juniors               54      2317
##  2 Adventurers             83      8458
##  3 Agents                  28     11970
##  4 Alpha Team              31      4313
##  5 Angry Birds              6      2388
##  6 Animal Crossing          5         0
##  7 Aquazone                37      6577
##  8 Architecture            60     39346
##  9 Atlantis                24      5323
## 10 Avatar                   9      5522
## # ℹ 146 more rows

Top 10 themes based on number of sets

## # A tibble: 10 × 3
##    name_parent             count_sets sum_parts
##    <chr>                        <int>     <int>
##  1 Gear                          3265      8653
##  2 Duplo                         1293     45232
##  3 Books                          981      8007
##  4 Star Wars                      948    348777
##  5 City                           870    183107
##  6 Collectible Minifigures        858      6025
##  7 Technic                        804    272966
##  8 Educational and Dacta          678    127674
##  9 Town                           677     84587
## 10 Friends                        561    121489

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

More analysis (interactive + animations)

Forecasting